cd ..
/home/msi/projects/diplomka
import scipy.stats as st
from pprint import pprint
import glob
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.stats import hmean
from collections import defaultdict
from traced.trace_monitor import SiteTraceMonitor
from traced.models import NormalModel
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"
import plotly
plotly.offline.init_notebook_mode()
# Pick one "<src>-<dest>" site-pair directory (index 4 is an arbitrary choice).
dataset = os.listdir('data/sites/')[4]
dataset
'CA_SFU_T2-CSCS_LCG2'
# Collect all traceroute files for the chosen pair in deterministic order.
files = sorted(glob.glob(f"data/sites/{dataset}/*"))
# Directory name encodes "<source site>-<destination site>".
src_site, dest_site = dataset.split('-')
stm = SiteTraceMonitor(src_site, dest_site)
# Process every file; stm.trace_models is read downstream.
stm.process(files)
0%| | 0/44635 [00:00<?, ?it/s]100%|██████████| 44635/44635 [00:36<00:00, 1239.55it/s]
# Select the second tracked IP pair and pull its raw per-trace detection
# records (a mapping consumed by DataFrame.from_dict below).
ip_pair = 1
data = stm.trace_models[
    list(stm.trace_models.keys())[ip_pair]
].detected
Parse from temporary container to dataframe¶
# Build the per-trace dataframe, indexed by timestamp.
df = pd.DataFrame.from_dict(data).T
df['ts'] = pd.to_datetime(df['ts'], unit='ms')
df.set_index('ts', inplace=True)

# Each raw column holds per-record sequences; unpack the positions we need
# into flat, named columns.  Mapping: new column -> (source column, index).
_UNPACK = {
    'n_hops_prob':     ('n_hops', 1),
    'n_hops_expected': ('n_hops', 2),
    'n_hops_observed': ('n_hops', 3),
    'ip_probs':        ('ip', 1),
    'asn_probs':       ('asn', 1),
    'path_prob':       ('path', 1),
    'rtt_anoms':       ('rtts', 0),
    'rtt_n_anoms':     ('rtts', 1),
    'rtt_z_scores':    ('rtts', 2),
    'rtt_expected':    ('rtts', 3),
    'rtt_observed':    ('rtts', 4),
    'rtt_sigma':       ('rtts', 5),
    'ttl_anoms':       ('ttls', 0),
    'ttl_probs':       ('ttls', 1),
    'ttl_expected':    ('ttls', 2),
    'ttl_observed':    ('ttls', 3),
    'ttl_n_anoms':     ('ttls', 4),
}
for new_col, (src_col, idx) in _UNPACK.items():
    # Bind idx as a default argument to avoid the late-binding closure pitfall.
    df[new_col] = df[src_col].apply(lambda x, i=idx: x[i])
# 'reached' is replaced in place by its element at position 1.
df['reached'] = df['reached'].apply(lambda x: x[1])
# Drop the packed source columns and unused flags now that everything is flat.
df.drop(columns=['has_anomaly', 'anom_share', 'ttls', 'rtts', 'path', 'n_hops', 'ip', 'asn'], inplace=True)
Outlier sequence weights¶
The goal of model averaging is to combine the information obtained from a set of models. Usually either the predictions or the parameters of the models are combined.
Some of the most common methods are:
normal averaging
- values are averaged using $\frac{1}{N}\sum^N_{i=1}{x_i}=\sum^N_{i=1}{\frac{1}{N} \cdot x_i}$
- we can interpret this as each model being equally important
weighted averaging
- we assign a weight to each model in order to scale its influence: $\sum^N_{i=1}{w_i x_i}$
- I use normalized weights, in order to reduce bias and smooth out the result
estimate weights with another model
- we can use another model to train the weights, however this is mostly used for supervised learning
I decided to try the normal and weighted averaging approaches and to find a way to automatically weight the models.
I tried to come up with few different types of weights based on the number of outliers and information of the current outlier state in the record.
For example, for some theoretical traceroute, let's say that we have these values:
| X | E | X-E | X/E | |
|---|---|---|---|---|
| 0 | 7.37 | 6.52 | 0.85 | 1.13 |
| 1 | 1.28 | 4.22 | -2.94 | 0.30 |
| 2 | 10.98 | 2.72 | 8.26 | 4.03 |
| 3 | 2.57 | 7.96 | -5.39 | 0.32 |
| 4 | 6.29 | 1.74 | 4.55 | 3.61 |
Where:
- X represents the RTT
- E is the expected RTT
- X-E is the error of prediction
- X/E is the ratio of the current measurement to the expected one
After some testing I decided to use the ratio value, as it had some nice attributes. For example, the value in an ideal case would be a constant (1), and if you compare the error and this rate measure (for the first and third row), you can see that the errors were rather different, but since the rate is proportional to the expected value, we can see that both of them are roughly 0.7 times better than expected.
Outlier sequences¶
The first thing I tried to achieve was to reduce the redundant information in the measurement by gradually decreasing the weights of values that fall within a sequence of models that are alerting for an outlier.
So I extract all subsequences based on detected outliers and then assign the first one a value equal to the length of the sequence, and for each subsequent index, the value is reduced by 1. The rest of them are assigned weight 1 and the weights are then normalized.
For example:
| is anomaly | sequence_id | sequence weight | |
|---|---|---|---|
| 0 | False | 0 | 1 |
| 1 | True | 1 | 2 |
| 2 | True | 1 | 1 |
| 3 | False | 0 | 1 |
| 4 | True | 2 | 2 |
| 5 | True | 2 | 1 |
| 6 | False | 0 | 1 |
| 7 | True | 3 | 3 |
| 8 | True | 3 | 2 |
| 9 | True | 3 | 1 |
Equalizing the anomalousness¶
Another goal for me was to incorporate the number of anomalies measured for each router, but when there are routers on which we detect a lot of anomalies, they would end up with a weight far greater than that of devices that are almost non-anomalous.
Hence I decided to use the logarithm of the number of outliers observed over time for each of the routers currently processed:
| anomalies | weight | log_anomalies | weight_log | |
|---|---|---|---|---|
| 0 | 504.0 | 0.096092 | 6.222576 | 0.107282 |
| 1 | 4.0 | 0.000763 | 1.386294 | 0.023901 |
| 2 | 391.0 | 0.074547 | 5.968708 | 0.102905 |
| 3 | 867.0 | 0.165300 | 6.765039 | 0.116635 |
| 4 | 224.0 | 0.042707 | 5.411646 | 0.093301 |
| 5 | 764.0 | 0.145663 | 6.638568 | 0.114454 |
| 6 | 460.0 | 0.087703 | 6.131226 | 0.105707 |
| 7 | 600.0 | 0.114395 | 6.396930 | 0.110288 |
| 8 | 895.0 | 0.170639 | 6.796824 | 0.117183 |
| 9 | 536.0 | 0.102193 | 6.284134 | 0.108344 |
Mixing the weights¶
I also decided to try and mix all of the previously mentioned weight vectors.
I calculated their average and the harmonic mean.
def extract_sequences(indexes):
    '''Extract maximal runs of consecutive integers from sorted indexes.

    Parameters
    ----------
    indexes : sequence of int
        Sorted index positions (e.g. positions of detected anomalies).

    Returns
    -------
    list of list of int
        One inner list per maximal run of consecutive values.
        An empty input yields ``[]`` (the previous version raised
        IndexError on ``indexes[0]``).
    '''
    sequences = []
    current_run = []
    for i in indexes:
        # Start a new run whenever i does not continue the previous value.
        if current_run and i != current_run[-1] + 1:
            sequences.append(current_run)
            current_run = []
        current_run.append(i)
    if current_run:
        sequences.append(current_run)
    return sequences
def calculate_seq_weights(anoms):
    '''Weight each position by its place within a run of consecutive anomalies.

    Every position starts with a base weight of 1.  Within each run of
    consecutive anomalous indexes, the first position additionally receives
    the run length, the next one the run length minus one, and so on down
    to 1 for the last position of the run.  Non-anomalous positions keep
    weight 1.

    Parameters
    ----------
    anoms : np.ndarray
        Boolean (or 0/1) array flagging anomalous positions.

    Returns
    -------
    np.ndarray
        Unnormalized weight vector of the same length as ``anoms``.
    '''
    anomalous_positions = np.where(anoms)[0]
    weights = np.ones(anoms.shape[0])
    if anomalous_positions.size == 0:
        return weights
    for run in extract_sequences(anomalous_positions):
        run_length = len(run)
        for offset, position in enumerate(run):
            weights[position] += run_length - offset
    return weights
def calculate_error_rate(E, X):
    '''Return element-wise ratios ``(1 + x) / (1 + mu)`` of observed to expected.

    The +1 offset keeps the ratio defined when the expected value is zero
    and pins the ideal case (``x == mu``) to exactly 1.

    Parameters
    ----------
    E : iterable of float
        Expected values (one per hop).
    X : iterable of float
        Observed values, aligned with ``E``.

    Returns
    -------
    list of float
        One ratio per aligned (expected, observed) pair.
    '''
    return [(1 + x) / (1 + mu) for mu, x in zip(E, X)]
def calculate_error_difference(E, X):
    '''Return element-wise prediction errors ``x - mu`` (observed minus expected).

    Parameters
    ----------
    E : iterable of float
        Expected values (one per hop).
    X : iterable of float
        Observed values, aligned with ``E``.

    Returns
    -------
    list of float
        Signed error per aligned (expected, observed) pair.
    '''
    return [x - mu for mu, x in zip(E, X)]
# Per-hop observed/expected RTT ratio and signed error for every trace.
df['rtt_error_rate'] = df.apply(lambda row: calculate_error_rate(row['rtt_expected'], row['rtt_observed']), axis=1)
df['rtt_error_diff'] = df.apply(lambda row: calculate_error_difference(row['rtt_expected'], row['rtt_observed']), axis=1)
# Scalar summaries per trace: mean ratio and total signed RTT error.
df['mean_rtt_error_rate'] = df['rtt_error_rate'].apply(np.mean)
df['sum_rtt_error'] = df['rtt_error_diff'].apply(np.sum)
Total error of RTT time¶
df['sum_rtt_error'].rolling('1H').apply(np.sum).plot(backend='plotly', title='Sum of RTT errors over time (1H mean)')
# Per-weighting-scheme accumulators for the weighted RTT error rate.
rtt_mean_z = []  # uniform weights (plain mean)
rtt_anom_z = []  # weighted by log(1 + historical anomaly count)
rtt_seq_z = []   # weighted by position within an anomaly run
rtt_hmix_z = []  # harmonic mean of the three weight vectors
rtt_mix_z = []   # arithmetic mean of the three weight vectors
# NOTE(review): seq_ws, seq_idxs, total_anoms and index_cnt are never read
# below — they look like leftovers from an earlier experiment.
seq_ws = []
seq_idxs = []
total_anoms = 1
index_cnt = defaultdict(lambda : 1)
for i, (ts, x) in tqdm(enumerate(df.iterrows()), total=df.shape[0]):
    anomalies = np.array(x['rtt_anoms'])
    # Anomaly-count weights: log-damped so highly anomalous hops do not
    # dominate, then normalized to sum to 1.
    anom_w = np.array(x['rtt_n_anoms'])
    anom_w = np.log1p(anom_w)#** 2
    anom_w /= anom_w.sum()
    # Uniform weights (equivalent to a plain mean after normalization).
    uniform_w = np.ones(anomalies.shape[0])
    uniform_w /= np.sum(uniform_w)
    # Sequence weights: emphasize the start of each anomaly run, normalized.
    seq_w = calculate_seq_weights(anomalies)
    seq_w /= np.sum(seq_w)
    # Per-hop RTT error rate for this trace.
    v = np.array(x['rtt_error_rate'])
    rtt_mean_z.append(v.mean())
    rtt_anom_z.append(v @ anom_w.T)
    rtt_seq_z.append(v @ seq_w.T)
    # Mixed schemes: element-wise harmonic / arithmetic mean of the vectors.
    rtt_hmix_z.append(v @ hmean([uniform_w, anom_w, seq_w], axis=0).T)
    rtt_mix_z.append(v @ np.mean([uniform_w, anom_w, seq_w], axis=0).T)
100%|██████████| 22185/22185 [00:13<00:00, 1672.41it/s]
rtt_anom_df = pd.DataFrame(data=list(zip(rtt_mean_z, rtt_anom_z, rtt_seq_z ,rtt_hmix_z, rtt_mix_z)), index=df.index, columns=['mean', 'anom', 'seq', 'hmix', 'mix'])
Weighted RTT rate¶
rtt_anom_df.plot(backend='plotly', title='RTT error rates')
Processing the result¶
I then use the selected weighted score as an input to the Normal model in order to detect bad traceroutes in terms of the RTT.
Mean of weights¶
# Fit a one-sided Normal model to the arithmetic-mix score to flag traces
# with anomalously high weighted RTT error rates.
model = NormalModel('a', 'b', one_sided=True, sigma_factor = 4, mu_0=1, sigma_0=1, alpha_0=1, beta_0=.1)
for i, x in rtt_anom_df['mix'].items():
    # model.log takes millisecond timestamps (index is in seconds).
    model.log(i.timestamp()*1000, x)
fig= plt.figure(figsize=(12, 5))
ax = plt.gca()
model.plot(ax = ax)
fig.suptitle(f"Weighted Average of Model for RTT error rate [{src_site} → {dest_site}]");
# Summarize detection counts and the RTT error mass of the flagged traces.
ax.set_title(f"#anomalies: {model.n_anomalies}, contamination: {100*model.n_anomalies/model.n:.1f}%,"
f"mean error per trace: {df[model.to_frame().anomalies]['sum_rtt_error'].mean():.2f}s, "
f"sum error: {df[model.to_frame().anomalies]['sum_rtt_error'].sum():.2f}s")
plt.show()
Harmonic Mean of weights¶
# Same model, now fed the harmonic-mix score for comparison.
model = NormalModel('a', 'b', one_sided=True, sigma_factor = 4, mu_0=1, sigma_0=1, alpha_0=1, beta_0=.1)
for i, x in rtt_anom_df['hmix'].items():
    # model.log takes millisecond timestamps (index is in seconds).
    model.log(i.timestamp()*1000, x)
fig= plt.figure(figsize=(12, 5))
ax = plt.gca()
model.plot(ax = ax)
fig.suptitle(f"Weighted Average of Model for RTT error rate [{src_site} → {dest_site}]");
# Summarize detection counts and the RTT error mass of the flagged traces.
ax.set_title(f"#anomalies: {model.n_anomalies}, contamination: {100*model.n_anomalies/model.n:.1f}%,"
f"mean error per trace: {df[model.to_frame().anomalies]['sum_rtt_error'].mean():.2f}s, "
f"sum error: {df[model.to_frame().anomalies]['sum_rtt_error'].sum():.2f}s")
plt.show()
# Keep observed scores and anomaly flags; mirror anomalous observations
# into a separate column for plotting.
tmp = model.to_frame()[['observed', 'anomalies']]
tmp['anoms'] = np.nan
tmp.loc[tmp['anomalies'],'anoms']=tmp.loc[tmp['anomalies']]['observed']
# NOTE(review): this recomputes a column already built above — redundant.
df['sum_rtt_error'] = df['rtt_error_diff'].apply(np.sum)
interactive plot with real errors¶
import plotly.graph_objects as go
# Interactive RTT plot: score line plus anomaly markers, with the real
# time error attached to the hover tooltip via customdata.
fig = go.Figure(data=[
    go.Scatter(x=tmp.index, y=tmp.observed, mode='lines', customdata=df['sum_rtt_error'], name="RTT error"),
    go.Scatter(x=tmp[tmp.anomalies].index, y=tmp[tmp.anomalies].observed, mode='markers', name="anomaly", customdata=df[tmp.anomalies]['sum_rtt_error'])],
)
fig.update_traces(hovertemplate='Date:%{x}<br>value:%{y}<br>Time error:%{customdata:.2f}')
fig.update_layout(
    title=f"RTT errors on {dataset} [{'->'.join(list(stm.trace_models.keys())[ip_pair])}]<br>count:{df[tmp.anomalies].shape[0]}, mean:{df[tmp.anomalies]['sum_rtt_error'].mean():.2f}s, cumulative:{df[tmp.anomalies]['sum_rtt_error'].sum():.2f}s",
    xaxis_title="Date",
    yaxis_title="RTT error rate"
)
fig.show()
TTL¶
I tried the same approach with TTL values
# Same per-hop error features as for RTT, now on TTL values.
df['ttl_error_rate'] = df.apply(lambda row: calculate_error_rate(row['ttl_expected'], row['ttl_observed']), axis=1)
df['ttl_error_diff'] = df.apply(lambda row: calculate_error_difference(row['ttl_expected'], row['ttl_observed']), axis=1)
# Per-weighting-scheme accumulators for the weighted TTL error rate
# (same scheme as the RTT loop above).
ttl_mean_z = []  # uniform weights (plain mean)
ttl_anom_z = []  # weighted by log-damped historical anomaly count
ttl_seq_z = []   # weighted by position within an anomaly run
ttl_hmix_z = []  # harmonic mean of the three weight vectors
ttl_mix_z = []   # arithmetic mean of the three weight vectors
# NOTE(review): total_anoms and index_cnt are never read below.
total_anoms = 1
index_cnt = defaultdict(lambda : 1)
for i, (ts, x) in tqdm(enumerate(df.iterrows()), total=df.shape[0]):
    anomalies = np.array(x['ttl_anoms'])
    anom_w = np.array(x['ttl_n_anoms'])
    # NOTE(review): log1p(anom_w+1) == log(anom_w+2) double-offsets the
    # count — inconsistent with the RTT loop's log1p(anom_w); confirm
    # whether the extra +1 is intentional.
    anom_w = np.log1p(anom_w+1).astype(np.float64)
    anom_w /= anom_w.sum()
    # Uniform weights (equivalent to a plain mean after normalization).
    uniform_w = np.ones(anomalies.shape[0])
    uniform_w /= np.sum(uniform_w)
    # Sequence weights: emphasize the start of each anomaly run, normalized.
    seq_w = calculate_seq_weights(anomalies)
    seq_w /= np.sum(seq_w)
    # Per-hop TTL error rate for this trace.
    v = np.array(x['ttl_error_rate'])
    ttl_mean_z.append(v.mean())
    ttl_anom_z.append(v @ anom_w.T)
    ttl_seq_z.append(v @ seq_w.T)
    # Mixed schemes: element-wise harmonic / arithmetic mean of the vectors.
    ttl_hmix_z.append(v @ hmean([uniform_w, anom_w, seq_w], axis=0).T)
    ttl_mix_z.append(v @ np.mean([uniform_w, anom_w, seq_w], axis=0).T)
100%|██████████| 22185/22185 [00:18<00:00, 1185.84it/s]
# One column per weighting scheme, indexed by trace timestamp.
ttl_anom_df = pd.DataFrame(data=list(zip(ttl_mean_z, ttl_anom_z, ttl_seq_z ,ttl_hmix_z, ttl_mix_z)), index=df.index, columns=['mean', 'anom', 'seq', 'hmix', 'mix'])
ttl_anom_df.plot(backend='plotly', title='TTL error rates')
# Distribution of the per-trace summed TTL error rate.
ax = df['ttl_error_rate'].apply(np.sum).hist(bins=100)
ax.set_title('Histogram of TTL error rate');
# Fit the one-sided Normal model to the harmonic-mix TTL score.
model = NormalModel('a', 'b', one_sided=True, gamma = 1, sigma_factor = 4, mu_0=1)
for i, x in ttl_anom_df['hmix'].items():
    # model.log takes millisecond timestamps (index is in seconds).
    model.log(i.timestamp()*1000, x)
fig= plt.figure(figsize=(12, 5))
ax = plt.gca()
model.plot(ax = ax)
fig.suptitle(f"Weighted Average of Model for TTL error rate [{src_site} → {dest_site}]");
# Summarize detection counts and the hop-error mass of the flagged traces.
ax.set_title(f"#anomalies: {model.n_anomalies}, contamination: {100*model.n_anomalies/model.n:.1f}%,"
f"mean error per trace: {df[model.to_frame().anomalies]['ttl_error_diff'].apply(np.sum).mean():.2f}hops, "
f"sum error: {df[model.to_frame().anomalies]['ttl_error_diff'].apply(np.sum).sum():.2f}hops"
)
plt.show()
# Distribution of absolute TTL error mass for flagged records only.
ax = df.loc[model.to_frame()['anomalies']]['ttl_error_diff'].apply(lambda x:np.sum(np.abs(x))).plot(kind='hist')
ax.set_title('Histogram of TTL errors for detected records')
Text(0.5, 1.0, 'Histogram of TTL errors for detected records')
# Keep observed scores and anomaly flags; mirror anomalous observations
# into a separate column for plotting.
tmp = model.to_frame()[['observed', 'anomalies']]
tmp['anoms'] = np.nan
tmp.loc[tmp['anomalies'],'anoms']=tmp.loc[tmp['anomalies']]['observed']
# Total absolute TTL error (in hops) per trace.
df['ttl_error'] = df['ttl_error_diff'].apply(lambda x:np.sum(np.abs(x)))
Interactive plot¶
# Interactive TTL plot: score line plus anomaly markers, with the real
# hop error attached to the hover tooltip via customdata.
fig = go.Figure(
    data=[
        go.Scatter(x=tmp.index, y=tmp.observed, mode='lines', customdata=df['ttl_error'], name="TTL error"),
        go.Scatter(x=tmp[tmp.anomalies].index, y=tmp[tmp.anomalies].observed, mode='markers', name="anomaly", customdata=df[tmp.anomalies]['ttl_error'])]
)
fig.update_traces(hovertemplate='Date:%{x}<br>value:%{y}<br>Hops error:%{customdata:.2f}')
fig.update_layout(
    title=f"TTL errors on {dataset} [{'->'.join(list(stm.trace_models.keys())[ip_pair])}] <br>count:{df[tmp.anomalies].shape[0]}, mean: {df[tmp.anomalies]['ttl_error'].mean():.2f} hops, cumulative: {df[tmp.anomalies]['ttl_error'].sum():.2f} hops",
    xaxis_title="Date",
    yaxis_title="TTL error rate",
)
fig.show()